knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align='center')Analysis: Clustifyr example
an example showing the use of clustifyr to identify cell types
1 Notes
Here we will load the 10X Genomics PBMC dataset, process it in the standard Seurat tutorial method and then demonstrate the use clustifyr to identify cell types.
The clustifyr tutorial can be found here: https://www.bioconductor.org/packages/release/bioc/vignettes/clustifyr/inst/doc/clustifyr.html
1.1 Samples
Dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found here: https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz
2 Environment
library(dplyr)
library(tidyr)
library(purrr)
library(Seurat)
library(patchwork)
library(clustifyr)
library(ComplexHeatmap)
library(DT)
set.seed(87645893)3 Load data
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "filtered_gene_bc_matrices/hg19/")
so <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
soAn object of class Seurat
13714 features across 2700 samples within 1 assay
Active assay: RNA (13714 features, 0 variable features)
1 layer present: counts
4 Preprocess data
so[["percent.mt"]] <- PercentageFeatureSet(so, pattern = "^MT-")
VlnPlot(so, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)plot1 <- FeatureScatter(so, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(so, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2# Filter cells based on nCount_RNA, nFeature_RNA, and percent.mt
so <- subset(so, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)5 Normalize data
# Normalize the data
so <- NormalizeData(so, normalization.method = "LogNormalize", scale.factor = 10000)6 Identify variable features
# Identify the 2,000 most highly variable features
so <- FindVariableFeatures(so, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(so), 10)
plot1 <- VariableFeaturePlot(so)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot27 Scale data
# Scale the data
so <- ScaleData(so, features = rownames(so))8 Perform PCA
# Perform PCA
so <- RunPCA(so, features = VariableFeatures(object = so))
print(so[["pca"]], dims = 1:2, ncol = 2)PC_ 1
Positive: CST3, TYROBP, LST1, AIF1, FTL, FTH1, LYZ, FCN1, S100A9, TYMP
FCER1G, CFD, LGALS1, S100A8, CTSS, LGALS2, SERPINA1, IFITM3, SPI1, CFP
Negative: MALAT1, LTB, IL32, IL7R, CD2, B2M, ACAP1, CD27, STK17A, CTSW
CD247, GIMAP5, AQP3, CCL5, SELL, TRAF3IP3, GZMA, MAL, CST7, ITM2A
PC_ 2
Positive: CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1, HLA-DRA, LINC00926, CD79B, HLA-DRB1, CD74
HLA-DMA, HLA-DPB1, HLA-DQA2, CD37, HLA-DRB5, HLA-DMB, HLA-DPA1, FCRLA, HVCN1, LTB
Negative: NKG7, PRF1, CST7, GZMB, GZMA, FGFBP2, CTSW, GNLY, B2M, SPON2
CCL4, GZMH, FCGR3A, CCL5, CD247, XCL2, CLIC3, AKR1C3, SRGN, HOPX
VizDimLoadings(so, dims = 1:2, reduction = "pca")DimPlot(so, reduction = "pca")DimHeatmap(so, dims = 1:15, cells = 500, balanced = TRUE)9 Determine the dimensionality of the data
ElbowPlot(so)10 Cluster the cells
# Cluster the cells
so <- FindNeighbors(so, dims = 1:10)
so <- FindClusters(so, resolution = 0.5)Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 2638
Number of edges: 95927
Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8728
Number of communities: 9
Elapsed time: 0 seconds
so <- RunUMAP(so, dims = 1:10)
DimPlot(so, reduction = "umap", label = TRUE)11 Clustifyr: Using a gene marker list
This examples uses the PanglaoDB marker gene list to identify cell types in the PBMC dataset.
11.1 Load marker gene list
# load panglaodb markers tsv
markers <- read.csv("../marker-gene-lists/PanglaoDB_markers_27_Mar_2020.tsv", sep = "\t")
datatable(markers, options = list(pageLength = 5))11.2 Filter for human genes
#filter for human genes; species column contains Hs for human
human_markers <- markers[grepl("Hs", markers$species),]11.3 Convert to dataframe with columns as cell type and rows as gene names
This is the format that clustifyr expects the marker gene list to be in. NAs should pad where there are fewer genes for a given cell type.
# convert to dataframe with columns as cell type and rows as gene names
gene_list <- human_markers %>%
select(cell.type, official.gene.symbol) %>%
distinct() %>%
group_split(cell.type) %>%
setNames(map_chr(., ~ .x$cell.type[1])) %>%
map(~ .x$official.gene.symbol)
# Pad to equal length and bind as dataframe
max_len <- max(lengths(gene_list))
cell_type_markers <- gene_list %>%
map(~ c(.x, rep(NA, max_len - length(.x)))) %>%
bind_cols()
datatable(cell_type_markers, options = list(pageLength = 10))11.4 Optionally filter for cell types of interest
# Optionally filter column names for cell types of interest
cell_type_markers <- cell_type_markers %>%
select("B cell", "T cell", "NK cell", "Monocyte", "Macrophage", "Dendritic cell", "Mast cell", "Neutrophil", "Eosinophil", "Basophil", "Fibroblast", "Endothelial")11.5 Run clustify_lists() to generate correlation matrix
# Available metrics include: "hyper", "jaccard", "spearman", "gsea"
list_res <- clustify_lists(
input = so, # matrix of normalized single-cell RNA-seq counts
cluster_col = "RNA_snn_res.0.5", # name of column in meta.data containing cell clusters
marker = cell_type_markers, # list of known marker genes
marker_inmatrix = FALSE,
metric = "jaccard", # test to use for assigning cell types
obj_out = FALSE # return Seurat object
)11.6 Plot heatmap of correlation matrix
plot_cor_heatmap(
cor_mat = list_res, # matrix of correlation coefficients from clustify_lists()
cluster_rows = TRUE, # cluster by row
cluster_columns = TRUE, # cluster by column
legend_title = "jaccard" # title of heatmap legend
)datatable(list_res, options = list(pageLength = 5))11.7 Assign cell types to Seurat object using clustify_lists()
so_res <- clustify_lists(
input = so, # matrix of normalized single-cell RNA-seq counts
cluster_col = "RNA_snn_res.0.5", # name of column in meta.data containing cell clusters
marker = cell_type_markers, # list of known marker genes
marker_inmatrix = FALSE,
metric = "jaccard", # test to use for assigning cell types
obj_out = TRUE # return Seurat object
)
# clustifyr stores the cell type assignments in the metadata column "type"
so_res <- SetIdent(so_res, value = "type")
DimPlot(so_res, reduction="umap", label = TRUE, , pt.size = 0.2) + NoLegend()12 Clustifyr: Using an existing annotated Seurat object
12.1 Load reference Seurat object
# Load the Seurat object
so_ref <- readRDS("so_ref.rds")
so_refAn object of class Seurat
13714 features across 2638 samples within 1 assay
Active assay: RNA (13714 features, 2000 variable features)
3 layers present: counts, data, scale.data
2 dimensional reductions calculated: pca, umap
datatable(so_ref@meta.data)12.2 Build reference matrix from previously annotated Seurat object
ref_mat <- seurat_ref(
seurat_object = so_ref,
cluster_col = "cell_types"
)12.3 Run clustify directly on your unannotated Seurat object
so_res2 <- clustify(
input = so, # unannotated Seurat object
ref_mat = ref_mat, # matrix of average expression per reference cell type
cluster_col = "seurat_clusters", # or whatever your cluster column is
obj_out = FALSE # return Seurat object with metadata updated
)
plot_cor_heatmap(
cor_mat = so_res2,
cluster_rows = TRUE,
cluster_columns = TRUE)so_res2 <- clustify(
input = so, # unannotated Seurat object
ref_mat = ref_mat, # matrix of average expression per reference cell type
cluster_col = "seurat_clusters", # or whatever your cluster column is
obj_out = TRUE # return Seurat object with metadata updated
)
# clustifyr stores the cell type assignments in the metadata column "type"
so_res2 <- SetIdent(so_res2, value = "type")
DimPlot(so_res2, reduction="umap", label = TRUE, pt.size = 0.2) + NoLegend()13 Session info
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.3.2
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Denver
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] DT_0.33 ComplexHeatmap_2.22.0 clustifyr_1.18.0
[4] patchwork_1.3.0 Seurat_5.2.1 SeuratObject_5.0.2
[7] sp_2.2-0 purrr_1.0.4 tidyr_1.3.1
[10] dplyr_1.1.4
loaded via a namespace (and not attached):
[1] RcppAnnoy_0.0.22 splines_4.4.2
[3] later_1.4.1 tibble_3.2.1
[5] polyclip_1.10-7 fastDummies_1.7.5
[7] lifecycle_1.0.4 doParallel_1.0.17
[9] globals_0.16.3 lattice_0.22-6
[11] MASS_7.3-61 crosstalk_1.2.1
[13] magrittr_2.0.3 sass_0.4.9
[15] plotly_4.10.4 rmarkdown_2.29
[17] jquerylib_0.1.4 yaml_2.3.10
[19] httpuv_1.6.15 sctransform_0.4.1
[21] spam_2.11-1 spatstat.sparse_3.1-0
[23] reticulate_1.41.0.1 cowplot_1.1.3
[25] pbapply_1.7-2 RColorBrewer_1.1-3
[27] abind_1.4-8 zlibbioc_1.52.0
[29] Rtsne_0.17 GenomicRanges_1.58.0
[31] BiocGenerics_0.52.0 circlize_0.4.16
[33] GenomeInfoDbData_1.2.13 IRanges_2.40.1
[35] S4Vectors_0.44.0 ggrepel_0.9.6
[37] irlba_2.3.5.1 listenv_0.9.1
[39] spatstat.utils_3.1-3 goftest_1.2-3
[41] RSpectra_0.16-2 spatstat.random_3.3-3
[43] fitdistrplus_1.2-2 parallelly_1.43.0
[45] codetools_0.2-20 DelayedArray_0.32.0
[47] tidyselect_1.2.1 shape_1.4.6.1
[49] UCSC.utils_1.2.0 farver_2.1.2
[51] matrixStats_1.5.0 stats4_4.4.2
[53] spatstat.explore_3.4-2 jsonlite_1.9.1
[55] GetoptLong_1.0.5 progressr_0.15.1
[57] ggridges_0.5.6 survival_3.7-0
[59] iterators_1.0.14 foreach_1.5.2
[61] tools_4.4.2 ica_1.0-3
[63] Rcpp_1.0.14 glue_1.8.0
[65] gridExtra_2.3 SparseArray_1.6.2
[67] xfun_0.51 MatrixGenerics_1.18.1
[69] GenomeInfoDb_1.42.3 withr_3.0.2
[71] BiocManager_1.30.25 fastmap_1.2.0
[73] entropy_1.3.1 digest_0.6.37
[75] R6_2.6.1 mime_0.13
[77] colorspace_2.1-1 Cairo_1.6-2
[79] scattermore_1.2 tensor_1.5
[81] spatstat.data_3.1-6 generics_0.1.3
[83] renv_1.1.1 data.table_1.17.0
[85] httr_1.4.7 htmlwidgets_1.6.4
[87] S4Arrays_1.6.0 uwot_0.2.3
[89] pkgconfig_2.0.3 gtable_0.3.6
[91] lmtest_0.9-40 SingleCellExperiment_1.28.1
[93] XVector_0.46.0 htmltools_0.5.8.1
[95] dotCall64_1.2 fgsea_1.32.2
[97] clue_0.3-66 scales_1.3.0
[99] Biobase_2.66.0 png_0.1-8
[101] spatstat.univar_3.1-2 knitr_1.50
[103] rstudioapi_0.17.1 reshape2_1.4.4
[105] rjson_0.2.23 nlme_3.1-166
[107] cachem_1.1.0 zoo_1.8-13
[109] GlobalOptions_0.1.2 stringr_1.5.1
[111] KernSmooth_2.23-24 parallel_4.4.2
[113] miniUI_0.1.1.1 vipor_0.4.7
[115] ggrastr_1.0.2 pillar_1.10.1
[117] vctrs_0.6.5 RANN_2.6.2
[119] promises_1.3.2 xtable_1.8-4
[121] cluster_2.1.6 beeswarm_0.4.0
[123] evaluate_1.0.3 cli_3.6.4
[125] compiler_4.4.2 rlang_1.1.5
[127] crayon_1.5.3 future.apply_1.11.3
[129] labeling_0.4.3 plyr_1.8.9
[131] ggbeeswarm_0.7.2 stringi_1.8.4
[133] viridisLite_0.4.2 deldir_2.0-4
[135] BiocParallel_1.40.0 munsell_0.5.1
[137] lazyeval_0.2.2 spatstat.geom_3.3-6
[139] Matrix_1.7-1 RcppHNSW_0.6.0
[141] future_1.34.0 ggplot2_3.5.1
[143] shiny_1.10.0 SummarizedExperiment_1.36.0
[145] ROCR_1.0-11 igraph_2.1.4
[147] bslib_0.9.0 fastmatch_1.1-6